For this project, we looked at a subset of the World Bank Open Data datasets, choosing the World Development Indicators dataset. This dataset contains data from over 170 countries, and consists of approximately 64000 rows and 70 columns.
Visual inspection of the data in a spreadsheet showed that for most of the countries, the data was most complete for the years 2000 to 2017, so we excluded the years before 2000 and after 2017.
We compared the quality of the data in developed countries to the overall data. Figs 1 & 2 The quality of the data as a whole was poor, with only 60% of the roads being complete and with 25% of the observations been missing.
In contrast, when we grouped the data into the G7 all the row2s were complete. Since the information that we wanted to explore was most closely linked to developed nations, we decided to explore the data from the group of seven - Canada, France, Germany, Italy, Japan, United Kingdom, and the United States.
We looked at the correlations between several of the indicators that were of interest. Fig 3 Gross domestic products had high correlation with communications and computer exports, charges for intellectual property, adjusted savings for mineral depletion, and consumption of fixed capital. Intellectual property sales had a high correlation with education.
The mirroring of GDP and sales from intellectual property can be seen in graphical forms. Figs 4 & 5
variables of interest:
gdf NY.GDP.MKTP.CD GDP (current US$)
com BX.GSR.CMCP.ZS Communications, computer, etc. (% of service exports, BoP)
ip BX.GSR.ROYL.CD Charges for the use of intellectual property, receipts (BoP, current US$)
ed NY.ADJ.AEDU.CD Adjusted savings: education expenditure (current US$)
inc NY.GSR.NFCY.CD Net income from abroad (current US$)
“NY.ADJ.NNAT.CD”,“Adjusted savings: net national savings (current US$)”
“NY.ADJ.DPEM.CD”,“Adjusted savings: particulate emission damage (current US$)”
“NY.ADJ.DNGY.CD”,“Adjusted savings: energy depletion (current US$)”
“NY.ADJ.DMIN.CD”,“Adjusted savings: mineral depletion (current US$)”
“NY.ADJ.DFOR.CD”,“Adjusted savings: net forest depletion (current US$)”
“NY.ADJ.DFOR.CD”,“Adjusted savings: net forest depletion (current US$)”
“NY.ADJ.DKAP.CD”,“Adjusted savings: consumption of fixed capital (current US$)”
set working directory
setwd('/Users/Hutch/Desktop/mung.joint')
getwd()
## [1] "/Users/Hutch/Desktop/mung.joint"
knitr::opts_chunk$set(message = FALSE)
# install.packages('data.table')
# install.packages('reshape2')
# install.packages('ggplot2')
# install.packages('tidyverse')
# install.packages('plotly')
# install.packages('listviewer')
# install.packages('gapminder')
# import Scotch
library(data.table)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.3
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::between() masks data.table::between()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks data.table::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(listviewer)
library(gapminder)
library(ggvis)
##
## Attaching package: 'ggvis'
## The following objects are masked from 'package:plotly':
##
## add_data, hide_legend
## The following object is masked from 'package:ggplot2':
##
## resolution
#install.packages("corrplot")
# install.packages("RColorBrewer")
library(corrplot)
## corrplot 0.84 loaded
library (dplyr)
library(RColorBrewer)
library(DataExplorer)
library(grid)
Please note that I changed the name of the data set to MP, standing for the mung project.
It is a copy of the original data set - API_3_DS2_en_csv_v2_1567828.csv
import data set (imported from my desktop)
# World bank dataset Developement indicators
mp.complete <- fread("/Users/Hutch/Desktop/mung.joint/MP.csv", header=TRUE)
.Visual inspection of the data showed that for most of the countries, the data was most complete for the years 2000 to 2017.
trim data of empty rows. left with years [2000-2017]
# will
# Remove year columns for years 1960- 1999
# leave 2000 - 2017
#remove col 3 so my code matches Dr. Reba's code
#remove early empty years
mp.2017<- mp.complete[,-c(5:44), with=F]
#view(mp.2017)
#ncol(mp.2017)
#remove 2018:2020 plus weird extra column
mp.2017<- mp.2017[,-c(23:26), with=F]
#remvove col 3 because Dr. Reba did
mp.2017 <- mp.2017[,-c(3)]
#view(mp.2017)
using one of the new packages whose name I forget but that I will look up
introduce(mp.2017)
plot_intro(mp.2017)
#dev.print(pdf, "Data 2000 - 2017 Errors.pdf")
# copies the plot to a thePDFfile
str(mp.2017)
## Classes 'data.table' and 'data.frame': 64680 obs. of 21 variables:
## $ Country Name : chr "Aruba" "Aruba" "Aruba" "Aruba" ...
## $ Country Code : chr "ABW" "ABW" "ABW" "ABW" ...
## $ Indicator Code: chr "PA.NUS.PRVT.PP" "PA.NUS.PPPC.RF" "PA.NUS.PPP" "PA.NUS.ATLS" ...
## $ 2000 : num 1.477 0.644 1.152 1.79 NA ...
## $ 2001 : num 1.478 0.665 1.191 1.79 NA ...
## $ 2002 : num 1.503 0.685 1.226 1.79 NA ...
## $ 2003 : num 1.524 0.686 1.229 1.79 NA ...
## $ 2004 : num 1.521 0.683 1.222 1.79 NA ...
## $ 2005 : num 1.521 0.684 1.225 1.79 NA ...
## $ 2006 : num 1.527 0.684 1.224 1.79 NA ...
## $ 2007 : num 1.565 0.705 1.263 1.79 NA ...
## $ 2008 : num 1.642 0.727 1.302 1.79 NA ...
## $ 2009 : num 1.613 0.734 1.314 1.79 NA ...
## $ 2010 : num 1.62 0.721 1.29 1.79 NA ...
## $ 2011 : num 1.638 0.728 1.303 1.79 NA ...
## $ 2012 : num 1.543 0.736 1.318 1.79 NA ...
## $ 2013 : num 1.513 0.718 1.285 1.79 NA ...
## $ 2014 : num 1.509 0.731 1.309 1.79 NA ...
## $ 2015 : num 1.528 0.761 1.362 1.79 NA ...
## $ 2016 : num 1.497 0.757 1.355 1.79 NA ...
## $ 2017 : num 1.48 0.755 1.351 1.79 NA ...
## - attr(*, ".internal.selfref")=<externalptr>
nrow(mp.2017)
## [1] 64680
ncol(mp.2017)
## [1] 21
64680 rows
# Create a tidier version (features as columns, observations as rows)
#based on code from Dr. R
# Collapse year columns into one
mp.ts <- gather(mp.2017, Year, "Value", 4:21)
#view(mp.ts)
# Standardize names because some of the names have blank spaces
names(mp.ts) <- make.names(names(mp.ts))
mp.ts <- spread(mp.ts, Indicator.Code, Value)
# Spread Indicator.Code to multiple columns
#view(mp.ts)
#nrow(mp.ts)
#ncol(mp.ts)
filter out the G7
g7 = filter(mp.ts, Country.Name == 'Canada' | Country.Name == 'Japan' | Country.Name == 'Italy' | Country.Name == 'Germany'| Country.Name == 'France'| Country.Name == 'United States' | Country.Name == 'United Kingdom' )
Other ways of looking for bad data. Most of it is complete for these countries.
# Check missingness proportion (from validation notes)
na.prop <- apply(g7, 2, function(x) sum(is.na(x)/length(x)) )
#view(na.prop)
# We select some variables of interest
#na.prop[order(na.prop)][4:20]
# keep all variable
summary(na.prop)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.08477 0.00000 1.00000
simple plot year v gdp all countries
#X11()
par(mar= c(1,1,1,1))
# Plot GDP per year
plot(
x = jitter(as.numeric(g7$Year)),
y = g7$NY.GDP.MKTP.CD,
col = as.factor(g7$Country.Name),
xlim = c(2003, 2020),
pch = 19
)
ggplot
#X11()
g7 = filter(mp.ts, Country.Name == 'Canada' | Country.Name == 'Japan' | Country.Name == 'Italy' | Country.Name == 'Germany'| Country.Name == 'France'| Country.Name == 'United States' | Country.Name == 'United Kingdom' )
x =as.numeric(g7$Year)
y = g7$NY.GDP.MKTP.CD
gp <- g7 %>%
ggplot(aes(x,y, col= as.factor(Country.Name )))+ geom_point() +
xlab("Countries")+
ggtitle('GDP 2009-2017')+
labs(col='Country')
gp
#dev.print(pdf, "G7 GDP.pdf")
review of interesting columns
GDP NY.GDP.MKTP.CD GDP (current US$)
COM BX.GSR.CMCP.ZS Communications, computer, etc. (% of service exports, BoP)
INTPROP BX.GSR.ROYL.CD Charges for the use of intellectual property, receipts (BoP, current US$)
EDUC NY.ADJ.AEDU.CD Adjusted savings: education expenditure (current US$)
TRADEINC NY.GSR.NFCY.CD Net income from abroad (current US$)
NY.GDP.MKTP.CD, BX.GSR.CMCP.ZS, BX.GSR.ROYL.CD, NY.ADJ.AEDU.CD, NY.GSR.NFCY.CD, NY.GSR.NFCY.CD, NY.ADJ.DPEM.CD, NY.ADJ.DNGY.CD, NY.ADJ.DMIN.CD, NY.ADJ.DFOR.CD, NY.ADJ.DKAP.CD Here are my columns of interest.
ADJSAV “NY.ADJ.NNAT.CD”,“Adjusted savings: net national savings (current US$)”
ADJEMIS “NY.ADJ.DPEM.CD”,“Adjusted savings: particulate emission damage (current US$)”
ADJENER “NY.ADJ.DNGY.CD”,“Adjusted savings: energy depletion (current US$)”
AJDMINERAL “NY.ADJ.DMIN.CD”,“Adjusted savings: mineral depletion (current US$)”
“NY.ADJ.DFOR.CD”,“Adjusted savings: net forest depletion (current US$)” HAS A LOT OF ZEROS
ADJCAP “NY.ADJ.DKAP.CD”,“Adjusted savings: consumption of fixed capital (current US$)”
GDPCAP NY.GDP.PCAP.CD GDF per capita
test <-g7 %>% select( Country.Name, Country.Code, Year,NY.GDP.MKTP.CD, BX.GSR.CMCP.ZS, BX.GSR.ROYL.CD, NY.ADJ.AEDU.CD, NY.GSR.NFCY.CD, NY.ADJ.NNAT.CD, NY.ADJ.DPEM.CD, NY.ADJ.DNGY.CD, NY.ADJ.DMIN.CD,
NY.ADJ.DFOR.CD, NY.ADJ.DKAP.CD, NY.GDP.PCAP.CD)
test2 <- test %>% rename(
GDP = NY.GDP.MKTP.CD,
COMM =BX.GSR.CMCP.ZS,
INTPROP = BX.GSR.ROYL.CD,
EDUC = NY.ADJ.AEDU.CD,
TRADEINC = NY.GSR.NFCY.CD,
ADJSAV = NY.ADJ.NNAT.CD,
ADJEMIS = NY.ADJ.DPEM.CD,
ADJENER = NY.ADJ.DNGY.CD,
AJDMINERAL = NY.ADJ.DMIN.CD,
ADJDEFOR = NY.ADJ.DFOR.CD,
ADJCAP = NY.ADJ.DKAP.CD, GDPCAP = NY.GDP.PCAP.CD )
g.7 <- test2
#view(g.7)
DataExplorer data much cleaner than the whole data set
introduce(g.7)
plot_intro(g.7)
#dev.print(pdf, "G7 Errors.pdf")
if(.Platform$OS.type == "unix") { # Only on unix-alikes, possibly Mac,
## put something like this is your .Rprofile to customize the defaults
setHook(packageEvent("grDevices", "onLoad"),
function(...) grDevices::X11.options(width = 8, height = 6, xpos = 0,
pointsize = 10))
}
making plot interactive with plotly
getwd()
## [1] "/Users/Hutch/Desktop/mung.joint"
fig <- ggplotly(gp)
fig
names(g.7) [1] “Country.Name” “Country.Code” [3] “Year” “GDP”
[5] “COMM” “INTPROP”
[7] “EDUC” “TRADEINC”
[9] “ADJSAV” “ADJEMIS”
[11] “ADJENER” “AJDMINERAL”
[13] “ADJDEFOR” “ADJCAP”
[15] “GDPCAP” cannot get this to work
#x11()
x =as.numeric(g.7$Year)
y = g.7$GDP
gdp7 <- g.7 %>%
ggplot(aes(x,y, col= as.factor(Country.Name )))+ geom_point() +
xlab("Countries")+
ggtitle('GDP 2009-2017')+
labs(col='Country')
gdp7
#dev.print(pdf, "G7 GDP .pdf")
x =as.numeric(g.7$Year)
yy = g.7$INTPROP
ip7 <- g.7 %>%
ggplot(aes(x,yy, col= as.factor(Country.Name )))+ geom_point() +
xlab("Countries")+
ggtitle('Intellectual Property sales')+
labs(col='Country')
ip7
making plot interactive with plotly
gdp7i <- ggplotly(gdp7)
ip7i <- ggplotly(ip7)
x =as.numeric(g.7$Year)
y = g.7$GDP
gdp7 <- g.7 %>%
ggplot(aes(x,y, col= as.factor(Country.Name )))+ geom_point() +
xlab("Countries")+
ggtitle('GDP 2009-2017')+
labs(col='Country')
x =as.numeric(g.7$Year)
yy = g.7$INTPROP
ip7 <- g.7 %>%
ggplot(aes(x,yy, col= as.factor(Country.Name )))+ geom_point() +
xlab("Countries")+
ggtitle('Intellectual Property sales')+
labs(col='Country')
library(grid)
#x11()
plot1 <- gdp7
plot2 <- ip7
#grid.arrange(plot1, plot2, ncol=2)
# ggarrange(plot1,plot2 nrow=1, ncol=2)
#dev.print(pdf, "G7 GDP v. Int Prop.pdf")
# x11()
## plot1 <- plotly(gdp7)
#plot2 <- plotly(ip7)
# grid.arrange(plot1, plot2, ncol=2)
# dev.print(pdf, "G7 GDP v. Int Prop.pdf")
correlatioj package - G7
install.packages(“corrplot”)
x11()
## Warning in system2("otool", c("-L", shQuote(DSO)), stdout = TRUE): running
## command ''otool' -L '/Library/Frameworks/R.framework/Resources/modules/
## R_X11.so'' had status 1
g.7%>% select_if(is.numeric) %>% cor() %>%
corrplot(type = "lower", tl.col = "black",diag = FALSE, addCoef.col = "grey70",
col = colorRampPalette(brewer.pal(11, "RdBu"))(100))
dev.print(pdf, "new G7 cor GDP v. Int Prop.pdf")
## quartz_off_screen
## 2
# install.packages("PerformanceAnalytics")
library(PerformanceAnalytics)
x11()
## Warning in system2("otool", c("-L", shQuote(DSO)), stdout = TRUE): running
## command ''otool' -L '/Library/Frameworks/R.framework/Resources/modules/
## R_X11.so'' had status 1
g.7 %>%
select_if(is.numeric) %>%
chart.Correlation()
dev.print(pdf, "chart.cor.pdf")
## quartz_off_screen
## 2
write.csv(g.7,'/Users/Hutch/Desktop/mung.joint/test.csv')